Libraries
if (!require("renv")) install.packages("renv")
Loading required package: renv
Attaching package: ‘renv’
The following objects are masked from ‘package:stats’:
embed, update
The following objects are masked from ‘package:utils’:
history, upgrade
The following objects are masked from ‘package:base’:
autoload, load, remove
library(renv)
renv::restore()
- The library is already synchronized with the lockfile.
library(here)
here() starts at C:/Users/Marcony1/OneDrive - Fundacion Universidad de las Americas Puebla/Documents/MDS/Block 6/DSCI 532/DSCI_532_individual-assignment_marcony1
library(dplyr)
Warning: package ‘dplyr’ was built under R version 4.3.3
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(readr)
Warning: package ‘readr’ was built under R version 4.3.3
Read data
zip_file <- here("data", "raw", "iter_00_cpv2020_csv.zip")
temp_dir <- here("temp")
dir.create(temp_dir, showWarnings = FALSE)
unzip(zip_file, files = c("iter_00_cpv2020/conjunto_de_datos/conjunto_de_datos_iter_00CSV20.csv", "iter_00_cpv2020/diccionario_datos/diccionario_datos_iter_00CSV20.csv"), exdir = temp_dir)
data_path <- here(temp_dir,
"iter_00_cpv2020",
"conjunto_de_datos",
"conjunto_de_datos_iter_00CSV20.csv")
dict_path <- here(temp_dir,
"iter_00_cpv2020",
"diccionario_datos",
"diccionario_datos_iter_00CSV20.csv")
info_dict <- read_csv(dict_path)
New names:Rows: 290 Columns: 10── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): ...1, ...2, ...3, ...4, ...5, ...6
lgl (4): ...7, ...8, ...9, ...10
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- read_csv(data_path)
Rows: 195662 Columns: 286── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (283): ENTIDAD, NOM_ENT, MUN, NOM_MUN, LOC, NOM_LOC, LONGITUD, LATITUD, ALTITUD, POBFEM, POBMAS, P_0A2, P_0A2_F, P_0A2_M, P_3YMAS, P_3YMAS_F, P_3Y...
dbl (3): POBTOT, VIVTOT, TVIVHAB
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unlink(temp_dir, recursive = TRUE)
# Exporting dictionary file
write_csv(info_dict,
here("data", "raw", "diccionario_datos_iter_00CSV20.csv"))
Exploration
head(df)
head(info_dict)
clean_info_dict <- info_dict[-c(1:3), ]
names(clean_info_dict) <- clean_info_dict[1, ]
clean_info_dict <- clean_info_dict[-1,]
clean_info_dict <- clean_info_dict[, -c(7:10)]
clean_info_dict
unique_states <- df |>
distinct(NOM_ENT)
unique_states
Selecting rows that we’ll analyze
rows_to_include <- c(1:12, 53:132, 136:140, 147, 155:211, 220:232)
filtered_data <- clean_info_dict |>
filter(row_number() %in% rows_to_include) |>
pull(4)
filtered_data
[1] "ENTIDAD" "NOM_ENT" "MUN" "NOM_MUN" "LOC" "NOM_LOC" "LONGITUD" "LATITUD" "ALTITUD" "POBTOT" "POBFEM"
[12] "POBMAS" "REL_H_M" "POB0_14" "POB15_64" "POB65_MAS" "P_0A4" "P_0A4_F" "P_0A4_M" "P_5A9" "P_5A9_F" "P_5A9_M"
[23] "P_10A14" "P_10A14_F" "P_10A14_M" "P_15A19" "P_15A19_F" "P_15A19_M" "P_20A24" "P_20A24_F" "P_20A24_M" "P_25A29" "P_25A29_F"
[34] "P_25A29_M" "P_30A34" "P_30A34_F" "P_30A34_M" "P_35A39" "P_35A39_F" "P_35A39_M" "P_40A44" "P_40A44_F" "P_40A44_M" "P_45A49"
[45] "P_45A49_F" "P_45A49_M" "P_50A54" "P_50A54_F" "P_50A54_M" "P_55A59" "P_55A59_F" "P_55A59_M" "P_60A64" "P_60A64_F" "P_60A64_M"
[56] "P_65A69" "P_65A69_F" "P_65A69_M" "P_70A74" "P_70A74_F" "P_70A74_M" "P_75A79" "P_75A79_F" "P_75A79_M" "P_80A84" "P_80A84_F"
[67] "P_80A84_M" "P_85YMAS" "P_85YMAS_F" "P_85YMAS_M" "PROM_HNV" "PNACENT" "PNACENT_F" "PNACENT_M" "PNACOE" "PNACOE_F" "PNACOE_M"
[78] "PRES2015" "PRES2015_F" "PRES2015_M" "PRESOE15" "PRESOE15_F" "PRESOE15_M" "P3YM_HLI" "P3YM_HLI_F" "P3YM_HLI_M" "P3HLINHE" "P3HLINHE_F"
[89] "P3HLINHE_M" "P3HLI_HE" "P3HLI_HE_F" "P3HLI_HE_M" "PHOG_IND" "POB_AFRO" "POB_AFRO_F" "POB_AFRO_M" "PCON_DISC" "PCON_LIMI" "PSIND_LIM"
[100] "P3A5_NOA" "P3A5_NOA_F" "P3A5_NOA_M" "P6A11_NOA" "P6A11_NOAF" "P6A11_NOAM" "P12A14NOA" "P12A14NOAF" "P12A14NOAM" "P15A17A" "P15A17A_F"
[111] "P15A17A_M" "P18A24A" "P18A24A_F" "P18A24A_M" "P8A14AN" "P8A14AN_F" "P8A14AN_M" "P15YM_AN" "P15YM_AN_F" "P15YM_AN_M" "P15YM_SE"
[122] "P15YM_SE_F" "P15YM_SE_M" "P15PRI_IN" "P15PRI_INF" "P15PRI_INM" "P15PRI_CO" "P15PRI_COF" "P15PRI_COM" "P15SEC_IN" "P15SEC_INF" "P15SEC_INM"
[133] "P15SEC_CO" "P15SEC_COF" "P15SEC_COM" "P18YM_PB" "P18YM_PB_F" "P18YM_PB_M" "GRAPROES" "GRAPROES_F" "GRAPROES_M" "PEA" "PEA_F"
[144] "PEA_M" "PE_INAC" "PE_INAC_F" "PE_INAC_M" "POCUPADA" "POCUPADA_F" "POCUPADA_M" "PDESOCUP" "PDESOCUP_F" "PDESOCUP_M" "PSINDER"
[155] "PDER_SS" "P12YM_SOLT" "P12YM_CASA" "P12YM_SEPA" "PCATOLICA" "PRO_CRIEVA" "POTRAS_REL" "PSIN_RELIG" "TOTHOG" "HOGJEF_F" "HOGJEF_M"
[166] "POBHOG" "PHOGJEF_F" "PHOGJEF_M"
selected_df <- df |>
select(filtered_data)
selected_df
Exporting as parquet
# Export wrangled data as parquet file
table <- arrow::Table$create(selected_df)
output_dir <- here("data", "processed", "parquet_data")
arrow::write_dataset(table, output_dir, partitioning = c("NOM_ENT", "ENTIDAD"), existing_data_behavior = "overwrite")
Reading parquet
ds <- open_dataset(here("data", "processed", "parquet_data")) |>
collect()
ds
ds_puebla <- open_dataset(here("data", "processed", "parquet_data")) |>
filter(NOM_ENT=="Puebla") |>
collect()
ds_puebla
ds_yucatan <- open_dataset(here("data", "processed", "parquet_data")) |>
filter(NOM_ENT=="Yucatán") |>
collect()
ds_yucatan
ds_nuevo_leon <- open_dataset(here("data", "processed", "parquet_data")) |>
filter(NOM_ENT=="Nuevo León") |>
collect()
ds_nuevo_leon
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCmF1dGhvcjogTWFyY28gUG9sbyBCcmF2byBNb250aWVsDQpkYXRlOiAyMDIwLTA0LTIxDQotLS0NCg0KIyMjIExpYnJhcmllcw0KDQpgYGB7cn0NCmlmICghcmVxdWlyZSgicmVudiIpKSBpbnN0YWxsLnBhY2thZ2VzKCJyZW52IikNCmxpYnJhcnkocmVudikNCnJlbnY6OnJlc3RvcmUoKQ0KbGlicmFyeShoZXJlKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkocmVhZHIpDQpsaWJyYXJ5KGFycm93KQ0KYGBgDQoNCiMjIyBSZWFkIGRhdGENCg0KYGBge3J9DQp6aXBfZmlsZSA8LSBoZXJlKCJkYXRhIiwgInJhdyIsICJpdGVyXzAwX2NwdjIwMjBfY3N2LnppcCIpDQpgYGANCg0KYGBge3J9DQp0ZW1wX2RpciA8LSBoZXJlKCJ0ZW1wIikNCmRpci5jcmVhdGUodGVtcF9kaXIsIHNob3dXYXJuaW5ncyA9IEZBTFNFKQ0KDQp1bnppcCh6aXBfZmlsZSwgZmlsZXMgPSBjKCJpdGVyXzAwX2NwdjIwMjAvY29uanVudG9fZGVfZGF0b3MvY29uanVudG9fZGVfZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIsICJpdGVyXzAwX2NwdjIwMjAvZGljY2lvbmFyaW9fZGF0b3MvZGljY2lvbmFyaW9fZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIpLCBleGRpciA9IHRlbXBfZGlyKQ0KYGBgDQoNCmBgYHtyfQ0KDQpkYXRhX3BhdGggPC0gaGVyZSh0ZW1wX2RpciwNCiAgICAgICAgICAgICAgICAgIml0ZXJfMDBfY3B2MjAyMCIsDQogICAgICAgICAgICAgICAgICJjb25qdW50b19kZV9kYXRvcyIsDQogICAgICAgICAgICAgICAgICJjb25qdW50b19kZV9kYXRvc19pdGVyXzAwQ1NWMjAuY3N2IikNCg0KZGljdF9wYXRoIDwtIGhlcmUodGVtcF9kaXIsDQogICAgICAgICAgICAgICAgICJpdGVyXzAwX2NwdjIwMjAiLA0KICAgICAgICAgICAgICAgICAiZGljY2lvbmFyaW9fZGF0b3MiLA0KICAgICAgICAgICAgICAgICAiZGljY2lvbmFyaW9fZGF0b3NfaXRlcl8wMENTVjIwLmNzdiIpDQoNCmluZm9fZGljdCA8LSByZWFkX2NzdihkaWN0X3BhdGgpDQpkZiA8LSByZWFkX2NzdihkYXRhX3BhdGgpDQoNCg0KdW5saW5rKHRlbXBfZGlyLCByZWN1cnNpdmUgPSBUUlVFKQ0KYGBgDQoNCmBgYHtyfQ0KIyBFeHBvcnRpbmcgZGljdGlvbmFyeSBmaWxlDQp3cml0ZV9jc3YoaW5mb19kaWN0LA0KICAgICAgICAgIGhlcmUoImRhdGEiLCAicmF3IiwgImRpY2Npb25hcmlvX2RhdG9zX2l0ZXJfMDBDU1YyMC5jc3YiKSkNCg0KYGBgDQoNCiMjIyBFeHBsb3JhdGlvbg0KDQpgYGB7cn0NCmhlYWQoZGYpDQpoZWFkKGluZm9fZGljdCkNCmBgYA0KDQpgYGB7cn0NCmNsZWFuX2luZm9fZGljdCA8LSBpbmZvX2RpY3RbLWMoMTozKSwgXQ0KbmFtZXMoY2xlYW5faW5mb19kaWN0KSA8LSBjbGVhbl9pbmZvX2RpY3RbMSwgXQ0KY2xlYW5faW5mb19kaWN0IDwtIGNsZWFuX2luZm9fZGljdFstMSxdDQpjbGVhbl9pbmZvX2RpY3QgPC0gY2xlYW5faW5mb19kaWN0WywgLWMoNzoxMCldDQoNCg0KY2xlYW5faW5mb19kaWN0DQpgYGANCg0KYGBge3J9DQp1bmlxdWVfc3RhdGVzIDwtIGRmIHw+IA0KICAgICAgZGlzdGluY3QoTk9NX0VOVCkNCg0KdW5pcXVlX3N0YXRlcw0KYGBgDQoNCiMjIyBTZWxlY3Rpbmcgcm93cyB0aGF0IHdlJ2xsIGFuYWx5emUNCg0KYGBge3J9DQpyb3dzX3RvX2luY2x1ZGUgPC0gYygxOjEyLCA1MzoxMzIsIDEzNjoxNDAsIDE0NywgMTU1OjIxMSwgMjIwOjIzMikNCg0KZmlsdGVyZWRfZGF0YSA8LSBjbGVhbl9pbmZvX2RpY3QgfD4gDQogICAgICBmaWx0ZXIocm93X251bWJlcigpICVpbiUgcm93c190b19pbmNsdWRlKSB8PiANCiAgICAgIHB1bGwoNCkNCg0KZmlsdGVyZWRfZGF0YQ0KYGBgDQoNCmBgYHtyfQ0Kc2VsZWN0ZWRfZGYgPC0gZGYgfD4gDQogICAgICBzZWxlY3QoZmlsdGVyZWRfZGF0YSkNCg0Kc2VsZWN0ZWRfZGYNCmBgYA0KDQojIyMgRXhwb3J0aW5nIGFzIHBhcnF1ZXQNCg0KYGBge3J9DQojIEV4cG9ydCB3cmFuZ2xlZCBkYXRhIGFzIHBhcnF1ZXQgZmlsZQ0KdGFibGUgPC0gYXJyb3c6OlRhYmxlJGNyZWF0ZShzZWxlY3RlZF9kZikNCg0Kb3V0cHV0X2RpciA8LSBoZXJlKCJkYXRhIiwgInByb2Nlc3NlZCIsICJwYXJxdWV0X2RhdGEiKQ0KDQphcnJvdzo6d3JpdGVfZGF0YXNldCh0YWJsZSwgb3V0cHV0X2RpciwgcGFydGl0aW9uaW5nID0gYygiTk9NX0VOVCIsICJFTlRJREFEIiksIGV4aXN0aW5nX2RhdGFfYmVoYXZpb3IgPSAib3ZlcndyaXRlIikNCmBgYA0KDQojIyMgUmVhZGluZyBwYXJxdWV0DQoNCmBgYHtyfQ0KZHMgPC0gb3Blbl9kYXRhc2V0KGhlcmUoImRhdGEiLCAicHJvY2Vzc2VkIiwgInBhcnF1ZXRfZGF0YSIpKSB8PiANCiAgICAgICAgY29sbGVjdCgpDQoNCmRzDQpgYGANCg0KYGBge3J9DQpkc19wdWVibGEgPC0gb3Blbl9kYXRhc2V0KGhlcmUoImRhdGEiLCAicHJvY2Vzc2VkIiwgInBhcnF1ZXRfZGF0YSIpKSB8Pg0KICAgIGZpbHRlcihOT01fRU5UPT0iUHVlYmxhIikgfD4gDQogICAgY29sbGVjdCgpDQoNCmRzX3B1ZWJsYQ0KYGBgDQoNCmBgYHtyfQ0KZHNfeXVjYXRhbiA8LSBvcGVuX2RhdGFzZXQoaGVyZSgiZGF0YSIsICJwcm9jZXNzZWQiLCAicGFycXVldF9kYXRhIikpIHw+DQogICAgZmlsdGVyKE5PTV9FTlQ9PSJZdWNhdMOhbiIpIHw+IA0KICAgIGNvbGxlY3QoKQ0KDQpkc195dWNhdGFuDQpgYGANCg0KYGBge3J9DQpkc19udWV2b19sZW9uIDwtIG9wZW5fZGF0YXNldChoZXJlKCJkYXRhIiwgInByb2Nlc3NlZCIsICJwYXJxdWV0X2RhdGEiKSkgfD4NCiAgICBmaWx0ZXIoTk9NX0VOVD09Ik51ZXZvIExlw7NuIikgfD4gDQogICAgY29sbGVjdCgpDQoNCmRzX251ZXZvX2xlb24NCmBgYA0K